library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
library(svglite)
dat_path = "../../MorPhiC/data/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")meta = fread("data/December_2023/JAX_RNAseq_ExtraEmbryonic_meta_data.tsv",
data.table = FALSE)
dim(meta)## [1] 90 36
meta[1:2,]## clonal.label library_preparation.label label
## 1 MOK20010-WT GT23-10491 PrS-MOK20010-WT
## 2 MOK20010C-A06 GT23-10506 PrS-MOK20010C-A06
## description
## 1 KOLF2.2 derived primitive syncytium
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE) derived primitive syncytium
## differentiated_product_protocol_id model_system timepoint_value
## 1 JAXPD001 PrS 6
## 2 JAXPD001 PrS 6
## timepoint_unit final_timepoint treatment_condition wt_control_status
## 1 days Yes hypoxia WT
## 2 days Yes Not applicable KO
## ko_strategy ko_gene library_preparation.description
## 1 WT WT NA
## 2 CE POU2F3 NA
## library_preparation.library_preparation_protocol_id
## 1 JAXPL001
## 2 JAXPL001
## library_preparation.average_fragment_size
## 1 431
## 2 428
## library_preparation.input_amount_value library_preparation.input_amount_unit
## 1 300 ng
## 2 300 ng
## library_preparation.concentration_value
## 1 47.7
## 2 47.7
## library_preparation.concentration_unit library_preparation.total_yield_value
## 1 nM 244
## 2 nM 244
## library_preparation.total_yield_unit library_preparation.cdna_pcr_cycles
## 1 ng 10
## 2 ng 10
## library_preparation.pcr_cycles_for_indexing
## 1 Not available
## 2 Not available
## file_id
## 1 KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001
## 2 POU2F3_CE_B05_GT23-10506_GCAGAATT-TGGCCGGT_S17_L001
## run_id
## 1 20230809_23-robson-005-run2
## 2 20230809_23-robson-005-run2
## clonal.description clonal.parental_name
## 1 KOLF2.2 KOLF2.2J
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE) KOLF2.2J
## clonal.clone_id clonal.type clonal.zygosity
## 1 WT iPSC Not applicable
## 2 A06 iPSC Not applicable
## clonal.cell_line_generation_protocol clonal.treatment_condition
## 1 Not applicable Not applicable
## 2 Not applicable Not applicable
## clonal.wt_control_status expression_alteration.label
## 1 WT Not applicable
## 2 KO JAX_POU2F3_Critical_exon
## model_organ
## 1 extra-embryonic primitive syncytial cells
## 2 extra-embryonic primitive syncytial cells
names(meta)## [1] "clonal.label"
## [2] "library_preparation.label"
## [3] "label"
## [4] "description"
## [5] "differentiated_product_protocol_id"
## [6] "model_system"
## [7] "timepoint_value"
## [8] "timepoint_unit"
## [9] "final_timepoint"
## [10] "treatment_condition"
## [11] "wt_control_status"
## [12] "ko_strategy"
## [13] "ko_gene"
## [14] "library_preparation.description"
## [15] "library_preparation.library_preparation_protocol_id"
## [16] "library_preparation.average_fragment_size"
## [17] "library_preparation.input_amount_value"
## [18] "library_preparation.input_amount_unit"
## [19] "library_preparation.concentration_value"
## [20] "library_preparation.concentration_unit"
## [21] "library_preparation.total_yield_value"
## [22] "library_preparation.total_yield_unit"
## [23] "library_preparation.cdna_pcr_cycles"
## [24] "library_preparation.pcr_cycles_for_indexing"
## [25] "file_id"
## [26] "run_id"
## [27] "clonal.description"
## [28] "clonal.parental_name"
## [29] "clonal.clone_id"
## [30] "clonal.type"
## [31] "clonal.zygosity"
## [32] "clonal.cell_line_generation_protocol"
## [33] "clonal.treatment_condition"
## [34] "clonal.wt_control_status"
## [35] "expression_alteration.label"
## [36] "model_organ"
table(table(meta$clonal.label))##
## 1 2
## 66 12
table(table(meta$library_preparation.label))##
## 1
## 90
table(table(meta$label))##
## 1
## 90
Manually correcting the gene strings in certain column names.
As of the time of running this code, all differences below are due to partial correction of the GHRL gene name.
It should be “GRHL1” instead of “GHRL1”.
It is fixed in the file_id column in meta table, but not fixed in the count data file column names yet.
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)## [1] 38592 91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]## Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674 0
## 2 ENSG00000271254 1030
## PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1 0
## 2 489
## PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1 0
## 2 603
setdiff(names(cts)[-1], meta$file_id)## [1] "GHRL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001"
## [2] "KOLF2_GHRL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"
## [3] "GHRL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001"
## [4] "GHRL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
## [5] "GHRL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
## [6] "GHRL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001"
## [7] "GHRL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001"
## [8] "GHRL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001"
## [9] "GHRL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001"
## [10] "GHRL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001"
setdiff(meta$file_id, names(cts)[-1])## [1] "KOLF2_GRHL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"
## [2] "GRHL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001"
## [3] "GRHL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001"
## [4] "GRHL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001"
## [5] "GRHL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001"
## [6] "GRHL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
## [7] "GRHL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
## [8] "GRHL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001"
## [9] "GRHL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001"
## [10] "GRHL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001"
## starting manually correcting count data file column names
names(cts) = gsub("GHRL", "GRHL", names(cts))
## end the manual correction
stopifnot(setequal(names(cts)[-1], meta$file_id))
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Namemodel_s = meta$model_system
table(model_s, useNA="ifany")## model_s
## ExM PrS
## 12 78
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
t(apply(cts_dat, 1, tapply, model_s, min)),
t(apply(cts_dat, 1, tapply, model_s, median)),
t(apply(cts_dat, 1, tapply, model_s, get_q75)),
t(apply(cts_dat, 1, tapply, model_s, max)))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM PrS ExM.1 PrS.1 ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674 0 0 0.0 0.0 0.00 0 0 3
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25 923 948 1169
table(row.names(gene_info)==gene_info$Name, useNA="ifany")##
## TRUE
## 38592
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4),
rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674 0 0 0.0 0.0 0.00
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25
## PrS_q75 ExM_max PrS_max
## ENSG00000268674 0 0 3
## ENSG00000271254 923 948 1169
summary(gene_info[,-1])## ExM_min PrS_min ExM_median PrS_median
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 2.0 Median : 1.0
## Mean : 376.3 Mean : 255.1 Mean : 530.5 Mean : 585.1
## 3rd Qu.: 120.0 3rd Qu.: 60.0 3rd Qu.: 188.5 3rd Qu.: 182.5
## Max. :512761.0 Max. :154723.0 Max. :797550.5 Max. :379480.0
## ExM_q75 PrS_q75 ExM_max PrS_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1
## Median : 3.0 Median : 3.0 Median : 6.0 Median : 10
## Mean : 608.8 Mean : 712.7 Mean : 751.6 Mean : 1102
## 3rd Qu.: 228.0 3rd Qu.: 233.8 3rd Qu.: 285.0 3rd Qu.: 405
## Max. :886970.0 Max. :456738.2 Max. :928104.0 Max. :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)##
## FALSE TRUE
## FALSE 12757 959
## TRUE 2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)## [1] 25835 90
gene_info = gene_info[w2kp,]gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)## [1] 62754 8
gene_anno[1:2,]## geneId chr strand start end ensembl_gene_id
## <char> <char> <char> <int> <int> <char>
## 1: ENSG00000000003.16 chrX - 100627108 100639991 ENSG00000000003
## 2: ENSG00000000005.6 chrX + 100584936 100599885 ENSG00000000005
## hgnc_symbol description
## <char> <char>
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(gene_info$Name %in% gene_anno$ensembl_gene_id)##
## TRUE
## 25835
# all ensembl gene IDs are contained in the annotation
setdiff(gene_info$Name, gene_anno$ensembl_gene_id)## character(0)
gene_info = merge(gene_anno, gene_info, by.x="ensembl_gene_id", by.y = "Name",
all.x = FALSE, all.y = TRUE)
dim(gene_info)## [1] 25835 16
gene_info[1:2,]## Key: <ensembl_gene_id>
## ensembl_gene_id geneId chr strand start end
## <char> <char> <char> <char> <int> <int>
## 1: ENSG00000000003 ENSG00000000003.16 chrX - 100627108 100639991
## 2: ENSG00000000419 ENSG00000000419.14 chr20 - 50934867 50959140
## hgnc_symbol
## <char>
## 1: TSPAN6
## 2: DPM1
## description
## <char>
## 1: tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: dolichyl-phosphate mannosyltransferase subunit 1, catalytic [Source:HGNC Symbol;Acc:HGNC:3005]
## ExM_min PrS_min ExM_median PrS_median ExM_q75 PrS_q75 ExM_max PrS_max
## <int> <int> <num> <num> <num> <num> <int> <int>
## 1: 1181 739 1650 1190.0 1730.50 1347.00 1889 1733
## 2: 674 781 744 1216.5 817.25 1359.75 1033 1952
gene_info[gene_info$ExM_min > 2e4, c("ExM_min", "hgnc_symbol")]## ExM_min hgnc_symbol
## <int> <char>
## 1: 22327 PABPC1
## 2: 74260 ACTB
## 3: 20930 HSP90AA1
## 4: 28311 MMP2
## 5: 27746 TIMP3
## 6: 62150 MYH9
## 7: 22345 SERPINE1
## 8: 44101 COL1A1
## 9: 21494 HSPA8
## 10: 28495 EIF4G2
## 11: 21084 KRT18
## 12: 23312 GAPDH
## 13: 52349 SPARC
## 14: 512761 FN1
## 15: 21893 CALD1
## 16: 21476 LRP1
## 17: 67802 AHNAK
## 18: 21821 MACF1
## 19: 42812 COL5A1
## 20: 26322 MAP1B
## 21: 43164 COL4A2
## 22: 41582 FLNB
## 23: 35934 TPM1
## 24: 43713 ITGB1
## 25: 35160 DST
## 26: 115551 EEF1A1
## 27: 33223 ACTC1
## 28: 68632 COL1A2
## 29: 25034 YWHAZ
## 30: 26937 EEF2
## 31: 30376 KRT8
## 32: 21806 CALR
## 33: 21608 ANXA2
## 34: 45334 ACTG1
## 35: 56039 COL4A1
## 36: 42340 FLNA
## 37: 25425 DYNC1H1
## 38: 64575 MT-CO2
## 39: 59989 MT-CYB
## 40: 44890 MT-ND2
## 41: 62659 MT-ND5
## 42: 255443 MT-CO1
## 43: 147599 MT-ND4
## 44: 61144 MT-ND1
## 45: 58951 MT-ATP6
## 46: 120331 MT-CO3
## 47: 27357 NEAT1
## 48: 263389 MALAT1
## 49: 111980
## ExM_min hgnc_symbol
gene_info[gene_info$PrS_min > 2e4, c("PrS_min", "hgnc_symbol")]## PrS_min hgnc_symbol
## <int> <char>
## 1: 41173 PABPC1
## 2: 30591 ACTB
## 3: 25451 PAPOLA
## 4: 23823 SCD
## 5: 28878 EIF4G2
## 6: 37637 KRT18
## 7: 26632 GAPDH
## 8: 51812 AHNAK
## 9: 24707 H19
## 10: 92177 EEF1A1
## 11: 23950 YWHAZ
## 12: 21852 HSP90B1
## 13: 23790 EEF2
## 14: 38248 KRT8
## 15: 27307 CALR
## 16: 20647 ACTG1
## 17: 20508 MT-ND5
## 18: 75374 MT-CO1
## 19: 50209 MT-ND4
## 20: 27870 MT-CO3
## 21: 154723 MALAT1
## 22: 88631
## PrS_min hgnc_symbol
gene_info$gene_symbol = gene_info$hgnc_symbol
table(gene_info$gene_symbol == "")##
## FALSE TRUE
## 19957 5878
table(is.na(gene_info$gene_symbol))##
## FALSE
## 25835
wEns = which(gene_info$gene_symbol == "" | is.na(gene_info$gene_symbol))
gene_info$gene_symbol[wEns] = gene_info$ensembl_gene_id[wEns]
table(gene_info$gene_symbol == "")##
## FALSE
## 25835
table(is.na(gene_info$gene_symbol))##
## FALSE
## 25835
Use each combination of (model system, condition) as DE group.
Run analysis for each DE group separately.
The specific DE groups can be found in the file with naming in the format:
${dataset_name}_DE_n_samples.tsv
There are also DE results between samples under normoxia and hypoxia conditions.
release_name = "2023_12_JAX_RNAseq_ExtraEmbryonic"
result_dir = file.path("results", release_name)
processed_output_dir = file.path(result_dir, "processed")
all_result_files = list.files(path=processed_output_dir, pattern=".tsv",
all.files=TRUE, full.names=FALSE)
all_result_files = sort(all_result_files)
regular_DE_files = all_result_files[!grepl("hyp_vs_nor", all_result_files)]
condition_DE_files = all_result_files[grepl("hyp_vs_nor", all_result_files)]
extract_cores <- function(x){
x_split = str_split(x, "_")[[1]]
x_start = 6
x_end = length(x_split)-1
x_d_group = paste(x_split[x_start:(x_end-1)], collapse="_")
x_gene = x_split[x_end]
return(c(x_d_group, x_gene))
}
df_file_info = NULL
for (x in regular_DE_files){
df_file_info = rbind(df_file_info, extract_cores(x))
}
df_file_info = as.data.frame(df_file_info)
colnames(df_file_info) = c("DE_general_group", "gene")
df_file_info## DE_general_group gene
## 1 ExM_day_6_nor ISL1
## 2 PrS_day_6_hyp EPAS1
## 3 PrS_day_6_nor EPAS1
## 4 PrS_day_6_nor FOSB
## 5 PrS_day_6_nor GCM1
## 6 PrS_day_6_nor GRHL1
## 7 PrS_day_6_nor POU2F3
## 8 PrS_day_6_nor PPARG
extract_cores_cond <- function(x){
x_split = str_split(x, "_")[[1]]
x_start = 6
x_end = length(x_split)-4
x_d_group = paste(x_split[x_start:(x_start+2)], collapse="_")
x_gene = x_split[x_end-1]
x_gene_strategy = paste(x_split[(x_end-1):x_end], collapse="_")
return(c(x_d_group, x_gene, x_gene_strategy))
}
df_file_info_cond = NULL
for (x in condition_DE_files){
df_file_info_cond = rbind(df_file_info_cond, extract_cores_cond(x))
}
df_file_info_cond = as.data.frame(df_file_info_cond)
colnames(df_file_info_cond) = c("DE_general_group", "gene", "gene_strategy")
df_file_info_cond## DE_general_group gene gene_strategy
## 1 PrS_day_6 EPAS1 EPAS1_CE
## 2 PrS_day_6 EPAS1 EPAS1_KO
## 3 PrS_day_6 EPAS1 EPAS1_PTC
## 4 PrS_day_6 WT WT_WT
df_size = NULL
figure_dir = file.path("figures", release_name)
volcano_figure_dir = file.path(figure_dir, "volcano_plots")
if (!file.exists(volcano_figure_dir)){
dir.create(volcano_figure_dir, recursive = TRUE)
}Plot the knockout genes from different DE groups all together in one figure.
df_file_info$items = paste(df_file_info$DE_general_group,
df_file_info$gene, sep="_")
df_file_info$items## [1] "ExM_day_6_nor_ISL1" "PrS_day_6_hyp_EPAS1" "PrS_day_6_nor_EPAS1"
## [4] "PrS_day_6_nor_FOSB" "PrS_day_6_nor_GCM1" "PrS_day_6_nor_GRHL1"
## [7] "PrS_day_6_nor_POU2F3" "PrS_day_6_nor_PPARG"
table(table(df_file_info$items))##
## 1
## 8
fc0 = log2(1.5)
p0 = 0.05
gene_symbols = df_file_info$gene
gene_symbols## [1] "ISL1" "EPAS1" "EPAS1" "FOSB" "GCM1" "GRHL1" "POU2F3" "PPARG"
table(gene_symbols %in% gene_info$gene_symbol)##
## TRUE
## 8
genes_w_multi_ensembl = names(which(table(gene_info$gene_symbol)>1))
genes_w_multi_ensembl## [1] "GOLGA8M" "LINC01238" "TNFRSF10A-DT"
df = data.frame(KO = df_file_info$items,
group = df_file_info$DE_general_group,
gene_symbol = df_file_info$gene,
CE_nDE = NA, KO_nDE = NA, PTC_nDE = NA,
CE_padj = NA, CE_log2FoldChange = NA,
KO_padj = NA, KO_log2FoldChange = NA,
PTC_padj = NA, PTC_log2FoldChange = NA)
df$Model = str_extract(df$KO, "^[a-zA-Z0-9]+")
gene_ids = gene_info$ensembl_gene_id[match(df_file_info$gene, gene_info$gene_symbol)]
gene_ids## [1] "ENSG00000016082" "ENSG00000116016" "ENSG00000116016" "ENSG00000125740"
## [5] "ENSG00000137270" "ENSG00000134317" "ENSG00000137709" "ENSG00000132170"
for (k in 1:nrow(df_file_info)){
res = fread(file.path(processed_output_dir,
paste0(release_name, "_", df_file_info$items[k], "_DEseq2.tsv")),
data.table = FALSE)
print(res[1:2,])
gene_id = gene_ids[k]
w_gene = which(res$gene_ID == gene_id)
stopifnot(length(w_gene) == 1)
stopifnot(!(df$gene_symbol[k]%in%genes_w_multi_ensembl))
for(ko1 in c("CE", "KO", "PTC")){
fc = paste0(df$gene_symbol[k], "_", ko1, "_log2FoldChange")
padj = paste0(df$gene_symbol[k], "_", ko1, "_padj")
col_name = paste0(ko1, "_nDE")
df[k,col_name] = sum(abs(res[[fc]]) > fc0 & res[[padj]] < p0, na.rm = TRUE)
col_name = paste0(ko1, "_log2FoldChange")
df[k,col_name] = res[[fc]][w_gene]
col_name = paste0(ko1, "_padj")
df[k,col_name] = res[[padj]][w_gene]
}
}## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## ISL1_CE_log2FoldChange ISL1_CE_pvalue ISL1_CE_padj ISL1_KO_log2FoldChange
## 1 0.280 0.0102 0.104 0.299
## 2 -0.191 0.5370 0.749 -0.660
## ISL1_KO_pvalue ISL1_KO_padj ISL1_PTC_log2FoldChange ISL1_PTC_pvalue
## 1 0.00615 0.0989 0.224 0.0412
## 2 0.03950 0.2780 -0.160 0.6070
## ISL1_PTC_padj
## 1 0.286
## 2 0.845
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## EPAS1_CE_log2FoldChange EPAS1_CE_pvalue EPAS1_CE_padj EPAS1_KO_log2FoldChange
## 1 -0.371 0.00184 0.0100 -0.397
## 2 -1.460 0.01890 0.0611 -1.960
## EPAS1_KO_pvalue EPAS1_KO_padj EPAS1_PTC_log2FoldChange EPAS1_PTC_pvalue
## 1 0.000912 0.00941 -0.231 0.06600
## 2 0.002040 0.01790 -1.720 0.00877
## EPAS1_PTC_padj
## 1 0.2100
## 2 0.0499
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## EPAS1_CE_log2FoldChange EPAS1_CE_pvalue EPAS1_CE_padj EPAS1_KO_log2FoldChange
## 1 0.109 0.252 0.443 0.00737
## 2 -0.544 0.164 0.333 -0.39500
## EPAS1_KO_pvalue EPAS1_KO_padj EPAS1_PTC_log2FoldChange EPAS1_PTC_pvalue
## 1 0.939 0.988 0.114 0.2270
## 2 0.308 NA -0.759 0.0558
## EPAS1_PTC_padj
## 1 0.547
## 2 0.294
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## FOSB_CE_log2FoldChange FOSB_CE_pvalue FOSB_CE_padj FOSB_KO_log2FoldChange
## 1 -0.156 0.107 0.34 -0.231
## 2 0.251 0.517 0.74 0.361
## FOSB_KO_pvalue FOSB_KO_padj FOSB_PTC_log2FoldChange FOSB_PTC_pvalue
## 1 0.0171 0.129 -0.0168 0.860
## 2 0.3480 0.650 0.2970 0.437
## FOSB_PTC_padj
## 1 0.984
## 2 0.903
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## GCM1_CE_log2FoldChange GCM1_CE_pvalue GCM1_CE_padj GCM1_KO_log2FoldChange
## 1 0.160 0.100 0.187 0.0458
## 2 -0.372 0.367 0.507 -0.3690
## GCM1_KO_pvalue GCM1_KO_padj GCM1_PTC_log2FoldChange GCM1_PTC_pvalue
## 1 0.649 0.773 0.0469 0.637
## 2 0.383 0.546 -0.8320 0.055
## GCM1_PTC_padj
## 1 0.77
## 2 0.13
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## GRHL1_CE_log2FoldChange GRHL1_CE_pvalue GRHL1_CE_padj GRHL1_KO_log2FoldChange
## 1 -0.706 3.08e-11 2.79e-09 -0.805
## 2 1.020 1.30e-02 4.51e-02 1.200
## GRHL1_KO_pvalue GRHL1_KO_padj GRHL1_PTC_log2FoldChange GRHL1_PTC_pvalue
## 1 1.00e-15 2.51e-13 -0.879 5.98e-18
## 2 1.63e-03 1.15e-02 1.070 5.70e-03
## GRHL1_PTC_padj
## 1 1.15e-15
## 2 1.75e-02
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## POU2F3_CE_log2FoldChange POU2F3_CE_pvalue POU2F3_CE_padj
## 1 -0.244 0.0144 0.0578
## 2 0.229 0.5650 0.7050
## POU2F3_KO_log2FoldChange POU2F3_KO_pvalue POU2F3_KO_padj
## 1 -0.162 0.097 0.228
## 2 0.060 0.879 0.928
## POU2F3_PTC_log2FoldChange POU2F3_PTC_pvalue POU2F3_PTC_padj
## 1 -0.163 0.0855 0.366
## 2 0.346 0.3590 0.669
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## PPARG_CE_log2FoldChange PPARG_CE_pvalue PPARG_CE_padj PPARG_KO_log2FoldChange
## 1 0.0708 0.455 0.588 0.063
## 2 0.2240 0.496 0.624 0.752
## PPARG_KO_pvalue PPARG_KO_padj PPARG_PTC_log2FoldChange PPARG_PTC_pvalue
## 1 0.507 0.6680 -0.0541 0.568
## 2 0.023 0.0707 0.4520 0.167
## PPARG_PTC_padj
## 1 0.700
## 2 0.291
print(df)## KO group gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj
## 1 ExM_day_6_nor_ISL1 ExM_day_6_nor ISL1 652 405 432 1.05e-10
## 2 PrS_day_6_hyp_EPAS1 PrS_day_6_hyp EPAS1 4196 2140 2553 1.53e-20
## 3 PrS_day_6_nor_EPAS1 PrS_day_6_nor EPAS1 1372 40 304 2.57e-16
## 4 PrS_day_6_nor_FOSB PrS_day_6_nor FOSB 409 847 121 1.75e-01
## 5 PrS_day_6_nor_GCM1 PrS_day_6_nor GCM1 5299 4486 4255 2.02e-04
## 6 PrS_day_6_nor_GRHL1 PrS_day_6_nor GRHL1 3439 2483 5095 1.35e-15
## 7 PrS_day_6_nor_POU2F3 PrS_day_6_nor POU2F3 2038 590 6 1.71e-07
## 8 PrS_day_6_nor_PPARG PrS_day_6_nor PPARG 4503 3029 4014 8.64e-01
## CE_log2FoldChange KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange
## 1 -1.6300 2.88e-07 -1.350 5.20e-14 -1.870
## 2 -2.5400 7.20e-58 -4.310 1.29e-11 -2.010
## 3 -2.2800 1.27e-65 -4.420 2.14e-10 -1.870
## 4 -0.6810 1.90e-18 -2.970 4.95e-01 0.592
## 5 0.6700 3.65e-01 0.208 1.07e-14 -1.360
## 6 -1.7700 2.82e-56 -3.130 7.09e-25 -2.100
## 7 -1.9700 9.64e-51 -5.960 5.02e-02 -1.110
## 8 0.0235 7.92e-09 0.571 1.98e-02 -0.251
## Model
## 1 ExM
## 2 PrS
## 3 PrS
## 4 PrS
## 5 PrS
## 6 PrS
## 7 PrS
## 8 PrS
p1 <- ggplot(df, aes(x=(PTC_nDE), y=(CE_nDE),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("# of DE genes") +
labs(x="PTC", y="CE") +
geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")
p2 <- ggplot(df, aes(x=(PTC_nDE), y=(KO_nDE),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("# of DE genes") +
labs(x="PTC", y="KO") +
geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")
p3 <- ggplot(df, aes(x=CE_log2FoldChange, y=-log10(CE_padj),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("DE results of the KO genes, CE") +
labs(x="log2 fold change", y="-log10(adjusted p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
p4 <- ggplot(df, aes(x=KO_log2FoldChange, y=-log10(KO_padj),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("DE results of the KO genes, KO") +
labs(x="log2 fold change", y="-log10(adjusted p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
p5 <- ggplot(df, aes(x=PTC_log2FoldChange, y=-log10(PTC_padj),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("DE results of the KO genes, PTC") +
labs(x="log2 fold change", y="-log10(adjusted p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
g_combined <- ggarrange(p1, p2, p3, p4, p5, ncol = 2, nrow = 3)
print(g_combined)Barchart for number of DE genes
df_file_info_cond$items = paste(df_file_info_cond$DE_general_group,
df_file_info_cond$gene_strategy, sep="_")
df_file_info_cond$items## [1] "PrS_day_6_EPAS1_CE" "PrS_day_6_EPAS1_KO" "PrS_day_6_EPAS1_PTC"
## [4] "PrS_day_6_WT_WT"
fc0 = log2(1.5)
p0 = 0.05
df_cond = data.frame(gene_strategy = df_file_info_cond$gene_strategy,
nDE = NA)
for (k in 1:nrow(df_file_info_cond)){
res = fread(file.path(processed_output_dir,
paste0(release_name, "_",
df_file_info_cond$items[k],
"_hyp_vs_nor_DEseq2.tsv")),
data.table = FALSE)
print(res[1:2,])
fc = "hyp_vs_nor_log2FoldChange"
padj = "hyp_vs_nor_padj"
df_cond[k,"nDE"] = sum(abs(res[[fc]]) > fc0 & res[[padj]] < p0, na.rm = TRUE)
}## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1 -0.584 0.000173 0.000885
## 2 0.353 0.618000 0.751000
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1 -0.447 0.0276 0.106
## 2 -0.355 0.5510 0.732
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1 -0.476 0.00143 0.00652
## 2 0.506 0.38100 0.53700
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1 -0.0979 0.4930 0.6120
## 2 1.1500 0.0105 0.0275
df_cond## gene_strategy nDE
## 1 EPAS1_CE 3518
## 2 EPAS1_KO 2518
## 3 EPAS1_PTC 3664
## 4 WT_WT 6724
p1 <- ggplot(df_cond, aes(x=gene_strategy, y=nDE, fill=gene_strategy)) +
geom_bar(stat="identity")+theme_classic() +
ylab("Number of DE genes")+
ggtitle("DE hyp vs nor") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
print(p1)df_file_info## DE_general_group gene items
## 1 ExM_day_6_nor ISL1 ExM_day_6_nor_ISL1
## 2 PrS_day_6_hyp EPAS1 PrS_day_6_hyp_EPAS1
## 3 PrS_day_6_nor EPAS1 PrS_day_6_nor_EPAS1
## 4 PrS_day_6_nor FOSB PrS_day_6_nor_FOSB
## 5 PrS_day_6_nor GCM1 PrS_day_6_nor_GCM1
## 6 PrS_day_6_nor GRHL1 PrS_day_6_nor_GRHL1
## 7 PrS_day_6_nor POU2F3 PrS_day_6_nor_POU2F3
## 8 PrS_day_6_nor PPARG PrS_day_6_nor_PPARG
fc0## [1] 0.5849625
p0## [1] 0.05
for(k in 1:nrow(df_file_info)){
it_k = df_file_info$items[k]
d_group = df_file_info$DE_general_group[k]
gene_name = df_file_info$gene[k]
print(it_k)
res = fread(file.path(processed_output_dir,
paste0(release_name, "_", it_k, "_DEseq2.tsv")),
data.table = FALSE)
for (ko1 in c("CE", "KO", "PTC")){
res_k = res[,c(1, grep(ko1, names(res)))]
names(res_k) = gsub(paste0(gene_name, "_", ko1, "_"), "", names(res_k))
ww_up = which((res_k$log2FoldChange > fc0) & (res_k$padj < p0))
ww_down = which((res_k$log2FoldChange < ((-1)*fc0)) & (res_k$padj < p0))
res_k$DE = "No"
res_k$DE[ww_up] <- "Up"
res_k$DE[ww_down] <- "Down"
res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
col2use = c("blue", "grey", "red")
names(col2use) = c("Down", "No", "Up")
res_k$delabel = NA
res_k$gene_symbol = gene_info$gene_symbol[match(res_k$gene_ID,
gene_info$ensembl_gene_id)]
w_de = which(res_k$DE != "No")
res_k$delabel[w_de] = res_k$gene_symbol[w_de]
print(table(res_k$DE))
if (gene_name=="EPAS1"){
d_group_in_title = d_group
}else{
d_group_in_title = gsub("_nor", "", d_group)
}
g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj),
col=DE, label=delabel)) +
geom_point() + ggtitle(paste0(d_group_in_title, "\n", gene_name, "_", ko1)) +
geom_text_repel(point.padding = 0.5,
min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20,
show.legend = FALSE) +
scale_color_manual(values=col2use)
print(g2)
}
saved_figure_name = paste0(d_group_in_title, "_", gene_name, "_", ko1, ".svg")
ggsave(file=file.path(volcano_figure_dir, saved_figure_name),
plot=g2, width=4, height=3, units="in")
}## [1] "ExM_day_6_nor_ISL1"
##
## Down No Up
## 312 23190 340
## Warning: Removed 6968 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23190 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 640 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 240 23437 165
## Warning: Removed 8340 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23437 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 396 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 251 23410 181
## Warning: Removed 7420 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23410 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 423 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 7420 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23410 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 423 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_hyp_EPAS1"
##
## Down No Up
## 1930 19130 2266
## Warning: Removed 4990 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19130 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4195 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 834 21186 1306
## Warning: Removed 5434 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 21186 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2137 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 1006 20773 1547
## Warning: Removed 5433 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20773 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2550 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 5433 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20773 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2550 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_nor_EPAS1"
##
## Down No Up
## 603 22121 769
## Warning: Removed 7321 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22121 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 1364 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 22 23453 18
## Warning: Removed 11844 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23453 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 36 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 161 23189 143
## Warning: Removed 8668 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23189 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 294 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 8668 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23189 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 294 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_nor_FOSB"
##
## Down No Up
## 48 23084 361
## Warning: Removed 6866 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23084 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 401 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 256 22646 591
## Warning: Removed 5991 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22646 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 832 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 34 23372 87
## Warning: Removed 6865 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23372 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 108 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 6865 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23372 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 108 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_nor_GCM1"
##
## Down No Up
## 3036 18194 2263
## Warning: Removed 4570 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18194 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 5299 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 2704 19007 1782
## Warning: Removed 4564 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19007 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4486 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 2599 19238 1656
## Warning: Removed 4514 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19238 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4255 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4514 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19238 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4255 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_nor_GRHL1"
##
## Down No Up
## 2055 20054 1384
## Warning: Removed 4556 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20054 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3437 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 1449 21010 1034
## Warning: Removed 4619 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 21010 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2482 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 2740 18398 2355
## Warning: Removed 4504 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18398 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 5092 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4504 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18398 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 5092 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_nor_POU2F3"
##
## Down No Up
## 1098 21455 940
## Warning: Removed 6005 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 21455 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2036 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 225 22903 365
## Warning: Removed 8668 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22903 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 589 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 4 23487 2
## Warning: Removed 4505 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23487 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: Removed 4505 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23487 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "PrS_day_6_nor_PPARG"
##
## Down No Up
## 2296 18990 2207
## Warning: Removed 4429 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18990 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4502 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 1634 20464 1395
## Warning: Removed 4613 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20464 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3027 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 2342 19479 1672
## Warning: Removed 4460 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19479 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4013 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4460 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19479 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4013 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
df_file_info_cond## DE_general_group gene gene_strategy items
## 1 PrS_day_6 EPAS1 EPAS1_CE PrS_day_6_EPAS1_CE
## 2 PrS_day_6 EPAS1 EPAS1_KO PrS_day_6_EPAS1_KO
## 3 PrS_day_6 EPAS1 EPAS1_PTC PrS_day_6_EPAS1_PTC
## 4 PrS_day_6 WT WT_WT PrS_day_6_WT_WT
fc0## [1] 0.5849625
p0## [1] 0.05
for(k in 1:nrow(df_file_info_cond)){
it_k = df_file_info_cond$items[k]
d_group = df_file_info_cond$DE_general_group[k]
gene_strategy_name = df_file_info_cond$gene_strategy[k]
print(it_k)
res = fread(file.path(processed_output_dir,
paste0(release_name, "_",
df_file_info_cond$items[k],
"_hyp_vs_nor_DEseq2.tsv")),
data.table = FALSE)
res_k = res[,c(1, grep("hyp_vs_nor", names(res)))]
names(res_k) = gsub("hyp_vs_nor_", "", names(res_k))
ww_up = which((res_k$log2FoldChange > fc0) & (res_k$padj < p0))
ww_down = which((res_k$log2FoldChange < ((-1)*fc0)) & (res_k$padj < p0))
res_k$DE = "No"
res_k$DE[ww_up] <- "Up"
res_k$DE[ww_down] <- "Down"
res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
col2use = c("blue", "grey", "red")
names(col2use) = c("Down", "No", "Up")
res_k$delabel = NA
res_k$gene_symbol = gene_info$gene_symbol[match(res_k$gene_ID,
gene_info$ensembl_gene_id)]
w_de = which(res_k$DE != "No")
res_k$delabel[w_de] = res_k$gene_symbol[w_de]
print(table(res_k$DE))
g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj),
col=DE, label=delabel)) +
geom_point() + ggtitle(paste0(d_group, " ", gene_strategy_name,
"\nhyp v.s. nor")) +
geom_text_repel(point.padding = 0.5,
min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20,
show.legend = FALSE) +
scale_color_manual(values=col2use)
print(g2)
saved_figure_name = paste0(d_group, "_", gene_strategy_name,
"_hyp_vs_nor.svg")
ggsave(file=file.path(volcano_figure_dir, saved_figure_name),
plot=g2, width=4, height=3, units="in")
}## [1] "PrS_day_6_EPAS1_CE"
##
## Down No Up
## 1624 17419 1894
## Warning: Removed 5277 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17419 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3505 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 5277 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17419 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3518 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_EPAS1_KO"
##
## Down No Up
## 1302 18659 1216
## Warning: Removed 6159 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18659 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2516 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 6159 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18659 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2516 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_EPAS1_PTC"
##
## Down No Up
## 1809 17367 1855
## Warning: Removed 4485 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17367 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3659 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4485 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17367 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3662 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "PrS_day_6_WT_WT"
##
## Down No Up
## 3400 14497 3324
## Warning: Removed 2059 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 14497 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 6722 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 2059 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 14497 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 6724 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 7897315 421.8 12621380 674.1 NA 12621380 674.1
## Vcells 19392366 148.0 46420136 354.2 65536 46410901 354.1
sessionInfo()## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] svglite_2.1.3 readxl_1.4.3
## [3] UpSetR_1.4.0 ComplexHeatmap_2.14.0
## [5] data.table_1.15.4 dplyr_1.1.4
## [7] ggpointdensity_0.1.0 viridis_0.6.5
## [9] viridisLite_0.4.2 DESeq2_1.38.3
## [11] SummarizedExperiment_1.28.0 Biobase_2.58.0
## [13] MatrixGenerics_1.10.0 matrixStats_1.3.0
## [15] GenomicRanges_1.50.2 GenomeInfoDb_1.34.9
## [17] IRanges_2.32.0 S4Vectors_0.36.2
## [19] BiocGenerics_0.44.0 RColorBrewer_1.1-3
## [21] MASS_7.3-60 stringr_1.5.1
## [23] ggpubr_0.6.0 ggrepel_0.9.5
## [25] ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.1-1 ggsignif_0.6.4 rjson_0.2.21
## [4] circlize_0.4.16 XVector_0.38.0 GlobalOptions_0.1.2
## [7] clue_0.3-65 rstudioapi_0.16.0 farver_2.1.2
## [10] bit64_4.0.5 AnnotationDbi_1.60.2 fansi_1.0.6
## [13] codetools_0.2-20 doParallel_1.0.17 cachem_1.1.0
## [16] geneplotter_1.76.0 knitr_1.48 jsonlite_1.8.8
## [19] broom_1.0.6 annotate_1.76.0 cluster_2.1.6
## [22] png_0.1-8 compiler_4.2.3 httr_1.4.7
## [25] backports_1.5.0 Matrix_1.5-4.1 fastmap_1.2.0
## [28] cli_3.6.3 htmltools_0.5.8.1 tools_4.2.3
## [31] gtable_0.3.5 glue_1.7.0 GenomeInfoDbData_1.2.9
## [34] Rcpp_1.0.13 carData_3.0-5 cellranger_1.1.0
## [37] jquerylib_0.1.4 vctrs_0.6.5 Biostrings_2.66.0
## [40] iterators_1.0.14 xfun_0.47 lifecycle_1.0.4
## [43] rstatix_0.7.2 XML_3.99-0.17 zlibbioc_1.44.0
## [46] scales_1.3.0 ragg_1.2.7 parallel_4.2.3
## [49] yaml_2.3.10 memoise_2.0.1 gridExtra_2.3
## [52] sass_0.4.9 stringi_1.8.4 RSQLite_2.3.7
## [55] highr_0.11 foreach_1.5.2 BiocParallel_1.32.6
## [58] shape_1.4.6.1 rlang_1.1.4 pkgconfig_2.0.3
## [61] systemfonts_1.1.0 bitops_1.0-8 evaluate_0.24.0
## [64] lattice_0.22-6 purrr_1.0.2 labeling_0.4.3
## [67] cowplot_1.1.3 bit_4.0.5 tidyselect_1.2.1
## [70] plyr_1.8.9 magrittr_2.0.3 R6_2.5.1
## [73] generics_0.1.3 DelayedArray_0.24.0 DBI_1.2.3
## [76] pillar_1.9.0 withr_3.0.1 KEGGREST_1.38.0
## [79] abind_1.4-5 RCurl_1.98-1.16 tibble_3.2.1
## [82] crayon_1.5.3 car_3.1-2 utf8_1.2.4
## [85] rmarkdown_2.28 GetoptLong_1.0.5 locfit_1.5-9.10
## [88] blob_1.2.4 digest_0.6.37 xtable_1.8-4
## [91] tidyr_1.3.1 textshaping_0.3.7 munsell_0.5.1
## [94] bslib_0.8.0